# Train a word2vec model on interleaved English/Dutch Europarl documents.
import gensim
from gensim.models import word2vec
import os

# The English and Dutch training files share the same file names, so listing the files of just one of the two folders is enough.
# Relative corpus folder name; not referenced again in the visible script.
root = 'Europarl remove_stop/'

# Absolute corpus location. NOTE(review): `dir` shadows the builtin of the
# same name; the name is kept because other parts of the script refer to it.
dir = 'G:/corpus/first_step/Europarl remove_stop/'

# English sub-folder and the file names it contains (read once, at import).
en_dir = ''.join([dir, 'en/'])
en_filename = os.listdir(en_dir)

# Define how a pair of pseudo-relevant documents is merged into one.
def merge(text1, text2):
    """Interleave two token sequences into a single mixed list.

    The shorter sequence is spread evenly through the longer one: each
    token of the shorter sequence is followed by ``m`` consecutive tokens
    of the longer sequence, where ``m = len(longer) // len(shorter)``.
    Tokens of the longer sequence left over after the last full group are
    appended at the end.

    When both sequences have the same length, ``text1`` is treated as the
    "shorter" one, so its tokens come first in every pair.

    Args:
        text1: First sequence of tokens (must support ``len`` and slicing).
        text2: Second sequence of tokens (same requirements).

    Returns:
        A new list containing every token of both inputs.
    """
    if len(text1) > len(text2):
        longer, shorter = text1, text2
    else:
        longer, shorter = text2, text1

    # Nothing to interleave: avoid dividing by zero and just copy the
    # (possibly also empty) longer sequence.
    if not shorter:
        return list(longer)

    group = len(longer) // len(shorter)
    merged = []
    for index, token in enumerate(shorter):
        merged.append(token)
        merged.extend(longer[index * group:(index + 1) * group])

    # Remainder of the longer sequence (an empty slice when it divides evenly).
    merged.extend(longer[len(shorter) * group:])
    return merged

# Build one training "sentence" per English/Dutch file pair by interleaving
# the whitespace-separated tokens of both files.
# The file names come from listing `en_dir`, so the files must be opened from
# that folder and from its Dutch sibling; the previous 'en' + filename form
# had no directory and no separator and resolved against the working directory.
nl_dir = dir + 'nl/'

sentences = []
for filename in en_filename:
    with open(en_dir + filename, 'r', encoding='utf-8') as rf:
        text1_sp = rf.read().split()
    with open(nl_dir + filename, 'r', encoding='utf-8') as pf:
        text2_sp = pf.read().split()

    merge_text = merge(text1_sp, text2_sp)
    # Keep only merged documents longer than 60 tokens.
    if len(merge_text) > 60:
        sentences.append(merge_text)


# The rest is the same as in we-vs.

# Train on the merged documents with 600-dimensional vectors and a context
# window of 60 tokens.
# NOTE(review): `size` is the gensim < 4.0 keyword; gensim 4 renamed it to
# `vector_size` - confirm which gensim version this script targets.
model = word2vec.Word2Vec(
    sentences=sentences,
    size=600,
    window=60,
)

# Persist the trained model to the current working directory.
model.save('we_vs2.model')




